/**
 * @author Fang Lu, fanglupku@gmail.com
 * 2011-3-23
 */
package data.processor;


/**
 * Static helpers that turn the HTML source of a page into plain text.
 *
 * <p>This class only sequences the work: comment/tag removal is delegated to
 * {@code ExceptionPageDealer} and entity decoding to {@code HTMLEntityTransform}.
 */
public class PageContentExtractor {
	/** Merge model selector; presumably "do not merge extracted text" (confirm with callers). */
	public static final int NO_MERGE = 0;
	/**
	 * Merge model selector; presumably "merge consecutive text" (confirm with callers).
	 * Must differ from {@link #NO_MERGE}, otherwise the two models are indistinguishable.
	 */
	public static final int CONTINUAL_MERGE = 1;

	/**
	 * Matches a line-break tag in any of its common spellings: {@code <br>},
	 * {@code <BR>}, {@code <br/>}, {@code <br />} and {@code <br clear="all">}.
	 * The mandatory whitespace before attributes keeps other tags that merely
	 * start with "br" from matching. Compiled once because it never changes.
	 */
	private static final java.util.regex.Pattern BR_TAG =
			java.util.regex.Pattern.compile("(?i)<br(?:\\s[^>]*)?/?>");

	/**
	 * Prepares raw page markup for tag stripping.
	 *
	 * <p>Passes the content through {@code ExceptionPageDealer.eraseAllComment}, then
	 * through {@code ExceptionPageDealer.eraseOneTagAndContent} for the {@code head},
	 * {@code script} and {@code style} tags (in that order), and finally replaces
	 * every line-break tag with a newline character so line structure survives the
	 * later removal of all tags.
	 *
	 * @param pageContent the page markup to clean
	 * @return the cleaned markup, with line-break tags turned into {@code "\n"}
	 */
	public static String pretreatPageContent(String pageContent) {
		String text = ExceptionPageDealer.eraseAllComment(pageContent);
		text = ExceptionPageDealer.eraseOneTagAndContent(text, "head");
		text = ExceptionPageDealer.eraseOneTagAndContent(text, "script");
		text = ExceptionPageDealer.eraseOneTagAndContent(text, "style");
		// Strings are immutable, so the result can be returned without copying.
		return BR_TAG.matcher(text).replaceAll("\n");
	}

	/**
	 * Extracts the text of a page.
	 *
	 * <p>Runs {@link #pretreatPageContent(String)}, then
	 * {@code ExceptionPageDealer.eraseAllTags}, then
	 * {@code HTMLEntityTransform.getResultString} on the result.
	 *
	 * @param pageContent the page markup to extract text from
	 * @param mergeModel one of {@link #NO_MERGE} or {@link #CONTINUAL_MERGE};
	 *        currently not consulted by this method, kept for caller compatibility
	 * @return the value returned by {@code HTMLEntityTransform.getResultString}
	 */
	public static String extractTextFromPage(String pageContent, int mergeModel) {
		String text = pretreatPageContent(pageContent);
		text = ExceptionPageDealer.eraseAllTags(text);
		return HTMLEntityTransform.getResultString(text);
	}
}
